'''
Created on Sep 10, 2012

@author: kingsfield
'''
import re
from collections import defaultdict
from util import Util, Constant


def get_words(text):
    '''Return all lower-cased runs of ASCII letters found in ``text``.

    The input is lower-cased first, so the result never contains upper-case
    characters; digits, punctuation and whitespace act as separators.
    '''
    lowered = text.lower()
    return re.findall('[a-z]+', lowered)

# Vocabulary of known words, built once at import time from 'big.txt'
# (path is relative to the current working directory). The file is opened
# in a ``with`` block so the handle is closed deterministically instead of
# relying on garbage collection.
with open('big.txt') as _corpus:
    words = set(get_words(_corpus.read()))

# Training data read by get_word_count(); its 4th comma-separated field is
# used as the query text.
train_file = '/home/kingsfield/Desktop/BestBuyPc/old_train.csv'
# Destination written by the script entry point: one "<word> <count>" line
# per word that is not in ``words``.
out_file = '/home/kingsfield/Desktop/BestBuyPc/words_spell.csv'

def get_word_count():
    '''Count word occurrences in the query column of ``train_file``.

    Every line of the file is split on ',' and the field at index 3 is taken
    as the query. Each query is tokenised with ``Util.wordutil.get_words``
    (project helper; presumably yields word strings -- confirm in ``util``).

    Returns a ``defaultdict(int)`` mapping each token to the number of times
    it was seen. Raises ``IndexError`` if a line has fewer than four fields.
    '''
    counts = defaultdict(int)
    with open(train_file) as handle:
        # Extract all queries up front, before any tokenising happens.
        query_column = [row.split(',')[3] for row in handle.readlines()]
    for text in query_column:
        for token in Util.wordutil.get_words(text):
            counts[token] += 1
    return counts

if __name__ == '__main__':
    # Script entry point: write every counted word that is neither in the
    # reference vocabulary nor purely numeric to ``out_file``, most frequent
    # first, as "<word> <count>" lines.
    word_count = get_word_count()
    # sorted() is stable, so words with equal counts keep the dict's
    # iteration order relative to each other.
    rank = sorted(word_count.items(), key=lambda x: x[1], reverse=True)
    with open(out_file, 'w') as fw:
        for w, count in rank:
            # Known words and all-digit tokens are not reported.
            if w in words or w.isdigit():
                continue
            fw.write(' '.join([w, str(count)]) + '\n')
    # Parenthesised form prints the same text on Python 2 and also parses
    # on Python 3, unlike the bare print statement.
    print('done')
